# Project
# Project
# Introduction
# Problem Statement
# Data Collection
# Importing required libraries
library(readr)
library(tidyverse)
library(tidymodels)
library(ggplot2)
library(dplyr)
library(caret)
library(e1071)
library(rpart)
# Load the raw soil data. The file is comma-separated and uses "." as the
# decimal mark (e.g. 7.86), which is the read.csv() convention; read.csv2()
# defaults to dec = "," and is intended for semicolon-separated files.
crop <- read.csv("Cropdata.csv", header = TRUE)
# Open the data in the interactive viewer, then print its structure.
View(crop)
str(crop)
## 'data.frame': 902 obs. of 7 variables:
## $ Time.line: Factor w/ 5 levels "2014-2015","2015-2016",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ PH : num 7.86 7.71 8.01 7.83 8.11 7.53 8.11 7.3 7.69 7.53 ...
## $ EC : num 0.931 0.694 1.11 1.09 1.14 1.02 1.14 1.02 0.921 1.02 ...
## $ N : num 168 100 91 100 82 ...
## $ P : num 17 19.9 16 19.9 17 19.9 17 19.9 17 19.9 ...
## $ k : num 32 43.6 99 43.6 102 43.6 1.2 43.6 36 43.6 ...
## $ Total : num 226 172 215 173 210 ...
# Preview the first rows of the raw data (recorded output follows).
head(crop)
## Time.line PH EC N P k Total
## 1 2014-2015 7.86 0.931 168.0 17.0 32.0 225.791
## 2 2014-2015 7.71 0.694 100.1 19.9 43.6 172.004
## 3 2014-2015 8.01 1.110 91.0 16.0 99.0 215.120
## 4 2014-2015 7.83 1.090 100.1 19.9 43.6 172.520
## 5 2014-2015 8.11 1.140 82.0 17.0 102.0 210.250
## 6 2014-2015 7.53 1.020 100.1 19.9 43.6 172.150
# Per-column summary statistics. In the recorded output below several maxima
# sit far above the third quartile (e.g. PH max 7388 vs. 3rd Qu. 8.10), which
# suggests outliers or data-entry errors in the raw file.
summary(crop)
## Time.line PH EC N
## 2014-2015:219 Min. : 0.36 Min. : 0.0090 Min. : 1.81
## 2015-2016:154 1st Qu.: 7.72 1st Qu.: 0.8152 1st Qu.: 100.10
## 2017-2018:224 Median : 7.96 Median : 1.0050 Median : 152.00
## 2018-2019:114 Mean : 18.70 Mean : 7.2213 Mean : 139.50
## 2019-2020:191 3rd Qu.: 8.10 3rd Qu.: 1.0900 3rd Qu.: 169.00
## Max. :7388.00 Max. :952.0000 Max. :1725.00
## P k Total
## Min. : 0.13 Min. : 0.20 Min. : 51.9
## 1st Qu.: 14.00 1st Qu.: 42.00 1st Qu.: 172.7
## Median : 17.00 Median : 43.60 Median : 224.0
## Mean : 26.37 Mean : 57.85 Mean : 249.4
## 3rd Qu.: 19.90 3rd Qu.: 81.00 3rd Qu.: 282.4
## Max. :1282.00 Max. :641.00 Max. :7552.6
# Round Total to a whole number so the class bands below are unambiguous.
crop$Total <- round(crop$Total, 0)
#**************************************************Step_1*******************************************
# Step 1: derive the Crop_Type label from the rounded Total.
# case_when() evaluates its conditions top to bottom and the first match wins,
# so each band only needs its upper bound and the bands cannot overlap.
crop_new <- mutate(
  crop,
  Crop_Type = case_when(
    # Missing or non-positive totals stay unlabelled (NA).
    is.na(Total) | Total < 1 ~ NA_character_,
    Total <= 200 ~ "Ground Nut",
    Total <= 214 ~ "Sugar Cane",
    Total <= 235 ~ "Grape",
    Total <= 244 ~ "Onion",
    Total <= 250 ~ "Banana",
    # Open-ended top band: no arbitrary upper cap, so very large totals are
    # still labelled instead of silently becoming NA.
    TRUE ~ "Turmeric"
  )
)
# Save the labelled data set as CSV (header row kept, row names omitted).
write.table(
  crop_new,
  file = "crop_new.csv",
  sep = ",",
  row.names = FALSE
)
# Inspect the labelled data in the interactive viewer.
View(crop_new)
# Data Preparation
# Randomly assign every row to partition 1 (training, ~70%) or partition 2
# (test, ~30%). The seed is fixed so that the split, the exported CSV files
# and all downstream model results can be reproduced from run to run.
set.seed(7)
sample_set <- sample(
  1:2,
  size = nrow(crop_new),
  replace = TRUE,
  prob = c(0.7, 0.3)
)
# Training partition: rows drawn into partition 1.
train <- crop_new[sample_set == 1, ]
head(train)
## Time.line PH EC N P k Total Crop_Type
## 2 2014-2015 7.71 0.694 100.1 19.9 43.6 172 Ground Nut
## 3 2014-2015 8.01 1.110 91.0 16.0 99.0 215 Grape
## 4 2014-2015 7.83 1.090 100.1 19.9 43.6 173 Ground Nut
## 5 2014-2015 8.11 1.140 82.0 17.0 102.0 210 Sugar Cane
## 7 2014-2015 8.11 1.140 82.0 17.0 1.2 109 Ground Nut
## 9 2014-2015 7.69 0.921 76.0 17.0 36.0 138 Ground Nut
# Export the training partition as a comma-separated file without row names
write.table(x = train, file = "crop_train.csv", sep = ",", row.names = FALSE)
# Hold-out partition: every row that was drawn into partition 2
test <- crop_new[which(sample_set == 2), , drop = FALSE]
head(test)
## Time.line PH EC N P k Total Crop_Type
## 1 2014-2015 7.86 0.931 168.0 17.0 32.0 226 Grape
## 6 2014-2015 7.53 1.020 100.1 19.9 43.6 172 Ground Nut
## 8 2014-2015 7.30 1.020 100.1 19.9 43.6 172 Ground Nut
## 13 2014-2015 8.39 6.000 100.1 19.9 43.6 178 Ground Nut
## 18 2014-2015 8.14 1.120 93.0 17.0 112.0 231 Grape
## 22 2014-2015 8.11 1.100 83.0 17.0 115.0 224 Grape
# Export the test partition as CSV. The data written must be `test` (not the
# full `crop_new` table) so that test.csv mirrors crop_train.csv.
write.table(
  test,
  file = "test.csv",
  sep = ",",
  row.names = FALSE
)
# Data Cleaning
library(DataExplorer)
# Count missing cells in each partition (recorded output: 0 for both).
sum(is.na(train))
## [1] 0
sum(is.na(test))
## [1] 0
# Visualise missing values in the training partition.
plot_missing(train)
# Exploratory Data Analysis (EDA)
# * describe() - can compute the statistics of all numerical variables
library(Hmisc)## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:e1071':
##
## impute
## The following object is masked from 'package:parsnip':
##
## translate
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
# Detailed per-variable description of the training partition
# (recorded output follows).
describe(train)
## train
##
## 8 Variables 629 Observations
## --------------------------------------------------------------------------------
## Time.line
## n missing distinct
## 629 0 5
##
## lowest : 2014-2015 2015-2016 2017-2018 2018-2019 2019-2020
## highest: 2014-2015 2015-2016 2017-2018 2018-2019 2019-2020
##
## Value 2014-2015 2015-2016 2017-2018 2018-2019 2019-2020
## Frequency 166 94 159 79 131
## Proportion 0.264 0.149 0.253 0.126 0.208
## --------------------------------------------------------------------------------
## PH
## n missing distinct Info Mean Gmd .05 .10
## 629 0 155 0.999 11.62 7.951 7.310 7.580
## .25 .50 .75 .90 .95
## 7.720 7.950 8.100 8.182 8.600
##
## lowest : 0.36000 1.12000 1.15000 2.06000 2.72000
## highest: 18.60000 18.70026 765.00000 768.00000 822.00000
##
## Value 0 10 20 760 770 820
## Frequency 6 618 2 1 1 1
## Proportion 0.010 0.983 0.003 0.002 0.002 0.002
##
## For the frequency table, variable is rounded to the nearest 10
## --------------------------------------------------------------------------------
## EC
## n missing distinct Info Mean Gmd .05 .10
## 629 0 229 0.999 3.722 5.939 0.170 0.328
## .25 .50 .75 .90 .95
## 0.831 0.994 1.090 1.282 1.840
##
## lowest : 0.009 0.040 0.050 0.060 0.070
## highest: 101.000 124.000 168.000 248.000 951.000
##
## Value 0 10 20 80 100 120 170 250 950
## Frequency 612 10 1 1 1 1 1 1 1
## Proportion 0.973 0.016 0.002 0.002 0.002 0.002 0.002 0.002 0.002
##
## For the frequency table, variable is rounded to the nearest 10
## --------------------------------------------------------------------------------
## N
## n missing distinct Info Mean Gmd .05 .10
## 629 0 84 0.938 139 45.78 100.1 100.1
## .25 .50 .75 .90 .95
## 100.1 152.0 169.0 179.0 189.0
##
## lowest : 1.81 7.87 8.00 15.00 15.40
## highest: 196.00 197.00 198.00 199.00 1725.00
## --------------------------------------------------------------------------------
## P
## n missing distinct Info Mean Gmd .05 .10
## 629 0 44 0.963 28.32 26.98 9.0 10.8
## .25 .50 .75 .90 .95
## 14.0 17.0 19.9 19.9 153.0
##
## lowest : 0.13 1.00 5.00 7.00 8.00
## highest: 171.00 172.00 173.00 178.00 1282.00
## --------------------------------------------------------------------------------
## k
## n missing distinct Info Mean Gmd .05 .10
## 629 0 94 0.966 56.33 30.67 14.0 25.0
## .25 .50 .75 .90 .95
## 41.0 43.6 81.0 95.0 99.0
##
## lowest : 0.2 1.2 4.0 6.0 7.0, highest: 122.0 129.0 146.0 160.0 196.0
## --------------------------------------------------------------------------------
## Total
## n missing distinct Info Mean Gmd .05 .10
## 629 0 156 0.992 239 79.29 172.0 172.0
## .25 .50 .75 .90 .95
## 173.0 223.0 281.0 296.0 307.6
##
## lowest : 78 98 106 109 113, highest: 1045 1079 1122 1397 1857
## --------------------------------------------------------------------------------
## Crop_Type
## n missing distinct
## 629 0 6
##
## lowest : Banana Grape Ground Nut Onion Sugar Cane
## highest: Grape Ground Nut Onion Sugar Cane Turmeric
##
## Value Banana Grape Ground Nut Onion Sugar Cane Turmeric
## Frequency 13 70 233 21 46 246
## Proportion 0.021 0.111 0.370 0.033 0.073 0.391
## --------------------------------------------------------------------------------
# Detailed per-variable description of the test partition
# (recorded output follows).
describe(test)
## test
##
## 8 Variables 273 Observations
## --------------------------------------------------------------------------------
## Time.line
## n missing distinct
## 273 0 5
##
## lowest : 2014-2015 2015-2016 2017-2018 2018-2019 2019-2020
## highest: 2014-2015 2015-2016 2017-2018 2018-2019 2019-2020
##
## Value 2014-2015 2015-2016 2017-2018 2018-2019 2019-2020
## Frequency 53 60 65 35 60
## Proportion 0.194 0.220 0.238 0.128 0.220
## --------------------------------------------------------------------------------
## PH
## n missing distinct Info Mean Gmd .05 .10
## 273 0 104 0.999 35.02 54.81 7.306 7.592
## .25 .50 .75 .90 .95
## 7.720 7.960 8.100 8.250 8.796
##
## lowest : 0.46000 0.81000 3.00000 6.08100 6.34000
## highest: 9.23000 9.91000 18.60000 18.70026 7388.00000
##
## Value 0 7400
## Frequency 272 1
## Proportion 0.996 0.004
##
## For the frequency table, variable is rounded to the nearest 100
## --------------------------------------------------------------------------------
## EC
## n missing distinct Info Mean Gmd .05 .10
## 273 0 132 0.999 15.28 28.7 0.168 0.350
## .25 .50 .75 .90 .95
## 0.810 1.010 1.090 1.288 3.104
##
## lowest : 0.009 0.070 0.080 0.090 0.110
## highest: 151.000 691.000 898.000 921.000 952.000
##
## Value 0 10 120 150 690 900 920 950
## Frequency 261 5 1 2 1 1 1 1
## Proportion 0.956 0.018 0.004 0.007 0.004 0.004 0.004 0.004
##
## For the frequency table, variable is rounded to the nearest 10
## --------------------------------------------------------------------------------
## N
## n missing distinct Info Mean Gmd .05 .10
## 273 0 58 0.949 140.7 40.22 100.1 100.1
## .25 .50 .75 .90 .95
## 100.1 156.0 169.0 181.0 189.4
##
## lowest : 8 78 83 84 88, highest: 194 195 196 198 275
## --------------------------------------------------------------------------------
## P
## n missing distinct Info Mean Gmd .05 .10
## 273 0 24 0.958 21.87 14.32 9.6 12.0
## .25 .50 .75 .90 .95
## 14.0 17.0 19.9 19.9 19.9
##
## lowest : 7 8 9 10 11, highest: 163 164 168 171 172
## --------------------------------------------------------------------------------
## k
## n missing distinct Info Mean Gmd .05 .10
## 273 0 70 0.961 61.34 34.08 23.6 34.0
## .25 .50 .75 .90 .95
## 43.6 43.6 81.0 96.0 105.6
##
## lowest : 0.87 1.00 12.00 14.00 15.00, highest: 128.00 130.00 144.00 175.00 641.00
## --------------------------------------------------------------------------------
## Total
## n missing distinct Info Mean Gmd .05 .10
## 273 0 106 0.993 273.4 141 172.0 172.0
## .25 .50 .75 .90 .95
## 173.0 225.0 284.0 302.6 315.0
##
## lowest : 52 165 171 172 173, highest: 908 1100 1112 1152 7553
##
## Value 100 200 300 400 500 800 900 1100 1200 7600
## Frequency 1 156 108 1 1 1 1 2 1 1
## Proportion 0.004 0.571 0.396 0.004 0.004 0.004 0.004 0.007 0.004 0.004
##
## For the frequency table, variable is rounded to the nearest 100
## --------------------------------------------------------------------------------
## Crop_Type
## n missing distinct
## 273 0 6
##
## lowest : Banana Grape Ground Nut Onion Sugar Cane
## highest: Grape Ground Nut Onion Sugar Cane Turmeric
##
## Value Banana Grape Ground Nut Onion Sugar Cane Turmeric
## Frequency 4 33 98 5 17 116
## Proportion 0.015 0.121 0.359 0.018 0.062 0.425
## --------------------------------------------------------------------------------
# - Two continuous variables
# - Taking PH & EC
library(ggplot2)
# log(PH) of the training samples per survey period (Time.line).
# Each observation is drawn twice: a large marker coloured by crop type with
# a small light-grey marker on top of it.
q <- ggplot(data = train, aes(x = Time.line, y = log(PH))) +
  geom_line(colour = "darkgreen") +
  geom_point(aes(colour = factor(Crop_Type)), size = 3) +
  geom_point(colour = "grey90", size = 1.5) +
  labs(
    title = "Crop according to PH for Time.line 2015-2020",
    # The plotted quantity is log(PH), so the axis label says so.
    y = "log(PH) of the soil",
    x = "Time.line"
  )
# Print the plot.
q
# plotly is needed for the interactive charts that follow.
library(plotly)
# Animated bubble chart of log(P) against log(PH): marker size follows k,
# colour follows crop type, and each animation frame is one Time.line period.
# The hover text shows the raw P value.
fig <- train %>%
  plot_ly(
    x = ~log(PH),
    y = ~log(P),
    size = ~k,
    color = ~Crop_Type,
    frame = ~Time.line,
    text = ~P,
    hoverinfo = "text",
    type = "scatter",
    mode = "markers"
  )
# NOTE(review): x is already log(PH) and the axis type is also "log", so the
# values are log-scaled twice -- confirm whether that is intended.
fig <- fig %>%
  layout(xaxis = list(type = "log"))
fig

# Crop type against log(PH) for the training samples, drawn as
# semi-transparent markers.
plot_ly(
  train,
  x = ~log(PH),
  y = ~Crop_Type,
  type = "scatter",
  mode = "markers",
  marker = list(color = "darkgreen"),
  opacity = 0.5
) %>%
  layout(
    title = "Crop according to PH for Time.line 2015-2020",
    # Axis titles match the variables actually mapped to each axis.
    yaxis = list(title = "Crop type"),
    xaxis = list(title = "log(PH) of the soil")
  )
# Boosting Algorithms
# The outcome must be a factor for caret to treat this as classification.
train$Crop_Type <- as.factor(train$Crop_Type)
library(mlbench)
library(caret)
# Shared settings for every model fitted below (not only boosting models):
# 10-fold cross-validation repeated 3 times, compared on accuracy.
# NOTE(review): Crop_Type was derived from Total in the labelling step, and
# Total is still a predictor in `Crop_Type ~ .` -- confirm that is intended,
# since it lets a model recover the label directly from its thresholds.
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# Seed that is re-applied before each fit.
seed <- 7
metric <- "Accuracy"
# Modelling
# SvmRadial
# Support vector machine with a radial basis function kernel, tuned by caret
# with the shared resampling settings. The seed is reset right before the fit.
set.seed(seed)
fit.svmRadial <- train(
  Crop_Type ~ .,
  data = train,
  method = "svmRadial",
  metric = metric,
  trControl = control
)
# Print the resampling results (recorded output follows).
fit.svmRadial
## Support Vector Machines with Radial Basis Function Kernel
##
## 629 samples
## 7 predictor
## 6 classes: 'Banana', 'Grape', 'Ground Nut', 'Onion', 'Sugar Cane', 'Turmeric'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 566, 566, 564, 567, 565, 567, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.8245223 0.7336092
## 0.50 0.8340136 0.7524886
## 1.00 0.8542609 0.7839876
##
## Tuning parameter 'sigma' was held constant at a value of 0.3331884
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.3331884 and C = 1.
# Stochastic Gradient Boosting
# Gradient boosting model tuned by caret with the shared resampling settings.
# The seed is reset right before the fit; verbose = FALSE is passed through
# to suppress the per-iteration training log.
set.seed(seed)
fit.gbm <- train(
  Crop_Type ~ .,
  data = train,
  method = "gbm",
  metric = metric,
  trControl = control,
  verbose = FALSE
)
# Print the resampling results (recorded output follows).
fit.gbm
## Stochastic Gradient Boosting
##
## 629 samples
## 7 predictor
## 6 classes: 'Banana', 'Grape', 'Ground Nut', 'Onion', 'Sugar Cane', 'Turmeric'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 566, 566, 564, 567, 565, 567, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.9973620 0.9961856
## 1 100 0.9973620 0.9961955
## 1 150 0.9968411 0.9954484
## 2 50 0.9968497 0.9954635
## 2 100 0.9957912 0.9939446
## 2 150 0.9957912 0.9939446
## 3 50 0.9973708 0.9962215
## 3 100 0.9958080 0.9939753
## 3 150 0.9952704 0.9932036
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 50, interaction.depth =
## 3, shrinkage = 0.1 and n.minobsinnode = 10.
# kNN
# k-nearest-neighbours classifier. Predictors are centred and scaled first
# because kNN is distance-based. The argument is spelled out in full as
# `preProcess` instead of relying on partial matching of `preProc`.
set.seed(seed)
fit.knn <- train(
  Crop_Type ~ .,
  data = train,
  method = "knn",
  metric = metric,
  preProcess = c("center", "scale"),
  trControl = control
)
# Print the resampling results (recorded output follows).
fit.knn
## k-Nearest Neighbors
##
## 629 samples
## 7 predictor
## 6 classes: 'Banana', 'Grape', 'Ground Nut', 'Onion', 'Sugar Cane', 'Turmeric'
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 566, 566, 564, 567, 565, 567, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.8928668 0.8448428
## 7 0.8807039 0.8269661
## 9 0.8780339 0.8219489
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
# Random Forest
# Random forest classifier tuned by caret with the shared resampling
# settings. The seed is reset right before the fit.
set.seed(seed)
fit.rf <- train(
  Crop_Type ~ .,
  data = train,
  method = "rf",
  metric = metric,
  trControl = control
)
# Print the resampling results (recorded output follows).
fit.rf
## Random Forest
##
## 629 samples
## 7 predictor
## 6 classes: 'Banana', 'Grape', 'Ground Nut', 'Onion', 'Sugar Cane', 'Turmeric'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 566, 566, 564, 567, 565, 567, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9649931 0.9492937
## 6 0.9936324 0.9907591
## 10 0.9984207 0.9977162
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 10.
# Model Selection
# Summarize results
# Collect the cross-validation resamples of the four fitted models into one
# object so their accuracy and kappa can be compared side by side.
boosting_results <- resamples(
  list(
    svmRadial = fit.svmRadial,
    gbm = fit.gbm,
    knn = fit.knn,
    rf = fit.rf
  )
)
# Print an overview of the collected resamples (recorded output follows).
boosting_results
##
## Call:
## resamples.default(x = list(svmRadial = fit.svmRadial, gbm = fit.gbm, knn
## = fit.knn, rf = fit.rf))
##
## Models: svmRadial, gbm, knn, rf
## Number of resamples: 30
## Performance metrics: Accuracy, Kappa
## Time estimates for: everything, final model fit
# Distribution of accuracy and kappa over the 30 resamples for each model
# (recorded output follows).
summary(boosting_results)
##
## Call:
## summary.resamples(object = boosting_results)
##
## Models: svmRadial, gbm, knn, rf
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## svmRadial 0.812500 0.8387097 0.8571429 0.8542609 0.8704389 0.890625 0
## gbm 0.983871 1.0000000 1.0000000 0.9973708 1.0000000 1.000000 0
## knn 0.812500 0.8730159 0.8977667 0.8928668 0.9058780 0.952381 0
## rf 0.983871 1.0000000 1.0000000 0.9984207 1.0000000 1.000000 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## svmRadial 0.7261056 0.7622753 0.7879185 0.7839876 0.8056046 0.8391960 0
## gbm 0.9768484 1.0000000 1.0000000 0.9962215 1.0000000 1.0000000 0
## knn 0.7305263 0.8157818 0.8495089 0.8448428 0.8641996 0.9292929 0
## rf 0.9765329 1.0000000 1.0000000 0.9977162 1.0000000 1.0000000 0
# Dot plot comparing the models' resampled performance metrics.
dotplot(boosting_results)
# Bagging Algorithms